Load and inspect MVP voting data
library(readr)
# Read the raw MVP voting table from disk; readr infers the column types.
mvp_voting <- read_csv(file = "Data/mvp_voting.csv")
Rows: 719 Columns: 21
── Column specification ────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Rank, Player, Tm
dbl (18): Age, First, Pts Won, Pts Max, Share, G, MP, PTS, TRB, AST, STL, BLK, FG%, 3P%, FT%, WS, WS...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the loaded MVP voting data to inspect it.
mvp_voting
Select only columns pertaining to MVP voting results
library(dplyr)
# Keep only the join keys (Player, Year) and the voting-result columns.
mvp_voting <- select(
  mvp_voting,
  Player, Year, `Pts Won`, `Pts Max`, Share
)
mvp_voting
Load and inspect player stats
# Read the per-player season statistics; readr infers the column types.
player_stats <- read_csv(file = "Data/player_stats.csv")
Rows: 23881 Columns: 31
── Column specification ────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Player, Pos, Tm
dbl (28): Rk, Age, G, GS, MP, FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%, FT, FTA, FT%, ORB, DRB...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the loaded player statistics to inspect them.
player_stats
Remove Rank column
# Drop the Rk column; every other column is kept in its original order.
player_stats <- select(player_stats, -Rk)
player_stats
Remove asterisks after names
library(stringr)
# Strip every literal "*" from the player names (fixed() disables regex
# interpretation of the asterisk).
player_stats <- player_stats %>%
  mutate(Player = str_remove_all(Player, fixed("*")))
player_stats
Convert NA percentage values to zeros. This also sets games started (GS) to zero for seasons that predate the tracking of that metric. I will not use that column in my models, so this should have no impact.
# Zero-fill missing values in the numeric columns only.
# - Only numeric columns are touched: writing a numeric 0 into the character
#   columns (Player, Pos, Tm) is a type mismatch.
# - coalesce() comes from dplyr, which is already attached; replace_na() belongs
#   to tidyr, which the visible part of this script never loads.
player_stats <- player_stats %>%
  mutate(across(where(is.numeric), ~ coalesce(.x, 0)))
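Group the data frame by player and year. Then, for any player-season that has several rows including a combined 'TOT' row, keep only the 'TOT' row and relabel its team with the team listed in the last row of that group.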
# Collapse one player-season group to its combined "TOT" row.
#
# Args:
#   df: Data frame with the rows of a single (Player, Year) group. It must
#     contain a `Tm` column.
#
# Returns:
#   `df` unchanged when it has at most one row or contains no "TOT" row.
#   Otherwise only the "TOT" row(s), with `Tm` overwritten by the `Tm` value
#   found in the last row of `df`.
handle_multiple_teams <- function(df) {
  if (nrow(df) <= 1) {
    return(df)
  }

  # Pull the column out as a plain character vector so the logic does not
  # depend on how a 1x1 data frame (or a factor) is coerced to character.
  team_codes <- as.character(df[["Tm"]])

  # which() ignores NA comparisons, so rows with a missing team never match.
  total_rows <- df[which(team_codes == "TOT"), , drop = FALSE]
  if (nrow(total_rows) == 0) {
    return(df)
  }

  # NOTE(review): this takes the team from the group's last row, so it assumes
  # the "TOT" row is not itself listed last - confirm the source ordering.
  total_rows[["Tm"]] <- team_codes[length(team_codes)]
  total_rows
}
# Make sure Tm is character, then reduce every (Player, Year) group with
# handle_multiple_teams() and drop the grouping again.
player_stats <- player_stats %>%
  mutate(Tm = as.character(Tm)) %>%
  group_by(Player, Year) %>%
  group_modify(
    function(group_rows, group_key) handle_multiple_teams(group_rows)
  ) %>%
  ungroup()
player_stats
Merge MVP voting with player stats
# Full join keeps every player-season from both tables, matched on Player and
# Year. Player-seasons with no MVP voting record get 0 for the voting columns.
# coalesce() comes from dplyr, which is already attached; replace_na() belongs
# to tidyr, which the visible part of this script never loads.
player_stats_with_mvp_voting <- player_stats %>%
  full_join(mvp_voting, by = c("Player", "Year")) %>%
  mutate(
    `Pts Won` = coalesce(`Pts Won`, 0),
    `Pts Max` = coalesce(`Pts Max`, 0),
    Share = coalesce(Share, 0)
  )
player_stats_with_mvp_voting
Load and inspect team stats
# Read the per-team season statistics; readr infers the column types.
team_stats <- read_csv("Data/team_stats.csv")
Rows: 1254 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): GB, Team
dbl (7): W, L, W/L%, PS/G, PA/G, SRS, Year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the loaded team statistics to inspect them.
team_stats
Remove asterisks and seeds from team names
# Clean the team names in three passes: drop literal asterisks, drop any
# parenthesised suffix, then trim and collapse the leftover whitespace.
team_stats <- team_stats %>%
  mutate(
    Team = str_remove_all(Team, fixed("*")),
    Team = str_remove_all(Team, "\\([^\\)]+\\)"),
    Team = str_squish(Team)
  )
team_stats
Replace the dashes in the games-back (GB) column with zeros
# Swap every em dash in GB for "0" so the column can be parsed as a number.
# NOTE(review): presumably the dash marks a team that is zero games back - confirm.
team_stats$GB <- str_replace_all(team_stats$GB, "—", "0")
team_stats
Convert the games-back (GB) column from character to numeric
# GB was read as character; convert it to numeric.
team_stats$GB <- as.numeric(team_stats$GB)
team_stats
Load mapping from full name to abbreviation
# Build a named list that maps each team abbreviation (first CSV field) to the
# team's full name (second CSV field). The header line is skipped, and a later
# duplicate abbreviation overwrites an earlier one.
abbreviation_rows <- read_lines("Data/abbreviations.csv")[-1]
abbreviations <- list()
for (fields in strsplit(abbreviation_rows, ",")) {
  abbreviations[[fields[1]]] <- fields[2]
}
Add full team names to player stats with MVP voting
# Add a Team column by translating each Tm abbreviation through the
# `abbreviations` mapping; recode() leaves values without a mapping unchanged.
player_stats_with_mvp_voting <- mutate(
  player_stats_with_mvp_voting,
  Team = recode(Tm, !!!abbreviations)
)
player_stats_with_mvp_voting
Merge player stats and MVP voting with team stats
# Attach the team-level statistics to each player-season, matched on Team and
# Year. A full join keeps unmatched rows from both sides.
everything <- player_stats_with_mvp_voting %>%
  full_join(team_stats, by = c("Team", "Year"))
everything
Save combined stats to csv
# Persist the fully merged table for downstream use.
write_csv(everything, "Data/combined_stats.csv")